In [325]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from statsmodels.tools.eval_measures import rmse
import matplotlib.pylab as plt

In [326]:
# Make pylab inline and set the theme to 'ggplot'
plt.style.use('ggplot')
%pylab inline


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['plt', 'mod']
`%pylab --no-import-all` prevents importing * from pylab and numpy

In [327]:
# Read Boston Housing Data
data = pd.read_csv('Datasets/Housing.csv')

In [328]:
# Create a data frame with all the independent features
data_indep = data.drop('medv', axis = 1)

# Create a target vector (the dependent variable, i.e. 'medv')
data_dep = data['medv']

# Split data into training and test sets
train_X, test_X, train_y, test_y = train_test_split(data_indep, data_dep,
                                                    test_size = 0.20,
                                                    random_state = 42)

Regression without any Outliers:


In [329]:
# Now let's fit a GradientBoostingRegressor with an L1 (Least Absolute Deviation) loss function
# Set a random seed so that we can reproduce the results
np.random.seed(32767)

# A GradientBoostingRegressor with L1 (Least Absolute Deviation) as the loss function
mod = GradientBoostingRegressor(loss='lad')

fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error
print "RMSE -> %f" % rmse(predict, test_y)


RMSE -> 3.440147

In [330]:
# Suppress scientific notation when printing numpy arrays
np.set_printoptions(suppress=True)

error = predict - test_y

# Print the squared error for each test sample
np.around(error ** 2, decimals = 2)


Out[330]:
array([   0.03,    0.09,   13.16,    0.08,    0.  ,    1.49,    0.52,
          0.31,    2.2 ,   11.76,    0.39,    0.  ,    0.71,    0.01,
          6.73,   62.99,    2.71,    0.08,   13.18,    3.96,    2.2 ,
         13.17,    0.18,    0.63,    3.91,    4.46,    2.2 ,    1.9 ,
          2.26,    6.1 ,    7.01,    0.24,  111.68,    0.44,   31.79,
          4.04,    0.06,    0.01,   11.08,    0.06,    6.94,    5.05,
         16.06,   12.71,    0.04,    0.  ,    4.9 ,    0.21,   10.28,
         10.61,    5.29,    0.04,    1.43,    6.06,    3.39,    0.11,
          7.18,   20.04,    1.45,    0.09,    4.82,    2.39,    2.19,
          0.05,    0.81,    0.63,    4.83,    4.14,    4.  ,    0.75,
          0.91,    0.65,    0.42,    1.17,    0.63,   12.26,    0.41,
          3.02,    0.89,   51.75,    0.16,   18.5 ,    0.36,    1.43,
          3.31,    8.56,    1.62,    2.8 ,    0.73,    1.59,    0.75,
          2.85,    0.55,    4.29,   48.63,    3.88,  476.98,   59.54,
         12.18,   22.68,    2.86,    0.45])

In [331]:
# A GradientBoostingRegressor with L2 (Least Squares) as the loss function
mod = GradientBoostingRegressor(loss='ls')

fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error
print "RMSE -> %f" % rmse(predict, test_y)


RMSE -> 2.542019

In [332]:
error = predict - test_y

# Print the squared error for each test sample
np.around(error ** 2, decimals = 2)


Out[332]:
array([  0.02,   1.51,  17.29,   1.49,   2.5 ,   5.38,   0.12,   0.03,
         1.03,  18.  ,   2.44,   1.11,   5.32,   0.31,   1.68,  16.66,
         1.54,   1.86,  24.29,   3.47,   1.  ,  14.71,   0.03,   2.27,
         0.61,   2.82,   3.62,   2.67,   5.87,   9.84,  11.36,   0.11,
        39.7 ,   2.7 ,  21.12,   6.05,   2.57,   0.09,  13.48,   0.56,
         1.96,   4.68,  32.54,  11.9 ,   0.  ,   0.24,   6.82,   0.  ,
         3.35,  15.8 ,   1.78,   0.07,   2.35,   2.28,  18.14,   0.04,
         3.54,  14.06,   3.48,   0.17,   0.57,   0.92,   0.7 ,   1.01,
         0.33,   9.48,   1.98,   1.3 ,   7.62,   7.32,   1.62,   1.61,
         0.06,   0.58,   6.42,   1.81,   0.03,   8.83,   0.81,  34.72,
         0.51,  15.03,   0.94,   0.73,   0.89,   4.12,   0.92,   1.91,
         0.39,   0.2 ,   0.7 ,   0.16,   0.1 ,   3.2 ,  25.71,  12.62,
        84.07,  30.38,   5.81,   7.08,   3.31,   2.21])

As is apparent from the RMSE of the L1 and L2 loss functions, Least Squares (L2) outperforms L1 when there are no outliers in the data.
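One way to see why: with a squared-error loss the best constant prediction is the mean of the targets, whereas with an absolute-error loss it is the median. Here is a minimal sketch (not part of the notebook's analysis, using a synthetic outlier-free sample on roughly the same scale as 'medv') showing that the mean is, by construction, the RMSE-optimal constant, so on clean data the L2 objective lines up better with the RMSE metric we are reporting.

import numpy as np

np.random.seed(0)
# Synthetic, outlier-free sample (illustrative values only)
y = np.random.normal(loc=22.5, scale=9.0, size=100)

for name, c in [('mean', y.mean()), ('median', np.median(y))]:
    rmse_c = np.sqrt(np.mean((y - c) ** 2))
    mae_c = np.mean(np.abs(y - c))
    print("%s prediction -> RMSE: %.3f, MAE: %.3f" % (name, rmse_c, mae_c))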

Regression with Outliers:


In [333]:
# Some statistics about the Housing Data
data.describe()


Out[333]:
crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000
mean 3.613524 11.363636 11.136779 0.069170 0.554695 6.284634 68.574901 3.795043 9.549407 408.237154 18.455534 356.674032 12.653063 22.532806
std 8.601545 23.322453 6.860353 0.253994 0.115878 0.702617 28.148861 2.105710 8.707259 168.537116 2.164946 91.294864 7.141062 9.197104
min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 1.730000 5.000000
25% 0.082045 0.000000 5.190000 0.000000 0.449000 5.885500 45.025000 2.100175 4.000000 279.000000 17.400000 375.377500 6.950000 17.025000
50% 0.256510 0.000000 9.690000 0.000000 0.538000 6.208500 77.500000 3.207450 5.000000 330.000000 19.050000 391.440000 11.360000 21.200000
75% 3.677082 12.500000 18.100000 0.000000 0.624000 6.623500 94.075000 5.188425 24.000000 666.000000 20.200000 396.225000 16.955000 25.000000
max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 37.970000 50.000000

Looking at the minimum and maximum values of the 'medv' column, we can see that its values lie in the range [5, 50].
Let's add a few outliers to this dataset, so that we can see a significant difference between the L1 and L2 loss functions.


In [334]:
# Get the lower and upper bounds [min, max] of all the features
stats = data.describe()
extremes = stats.loc[['min', 'max'],:].drop('medv', axis = 1)
extremes


Out[334]:
crim zn indus chas nox rm age dis rad tax ptratio black lstat
min 0.00632 0 0.46 0 0.385 3.561 2.9 1.1296 1 187 12.6 0.32 1.73
max 88.97620 100 27.74 1 0.871 8.780 100.0 12.1265 24 711 22.0 396.90 37.97

Now we are going to generate 5 random samples whose values lie within the [min, max] range of their respective features.


In [335]:
# Set a random seed
np.random.seed(1234)

# Create 5 random values 
rands = np.random.rand(5, 1)
rands


Out[335]:
array([[ 0.19151945],
       [ 0.62210877],
       [ 0.43772774],
       [ 0.78535858],
       [ 0.77997581]])

In [336]:
# Get the 'min' and 'max' rows as numpy arrays
min_array = np.array(extremes.loc[['min'], :])
max_array = np.array(extremes.loc[['max'], :])

# Find the difference (range) between 'max' and 'min' for each feature
# (named 'feat_range' so we don't shadow the Python builtin 'range')
feat_range = max_array - min_array
feat_range


Out[336]:
array([[  88.96988,  100.     ,   27.28   ,    1.     ,    0.486  ,
           5.219  ,   97.1    ,   10.9969 ,   23.     ,  524.     ,
           9.4    ,  396.58   ,   36.24   ]])

In [337]:
# Generate the 5 outlier samples by scaling 'rands' into each feature's range
outliers_X = (rands * feat_range) + min_array
outliers_X


Out[337]:
array([[  17.04578252,   19.15194504,    5.68465061,    0.19151945,
           0.47807845,    4.56054001,   21.49653863,    3.23572024,
           5.40494736,  287.356192  ,   14.40028283,   76.27278363,
           8.67066488],
       [  55.35526271,   62.2108771 ,   17.43112727,    0.62210877,
           0.68734486,    6.80778568,   63.30676167,    7.97086794,
          15.30850173,  512.98499602,   18.44782245,  247.03589642,
          24.27522186],
       [  38.95090441,   43.7727739 ,   12.40121272,    0.43772774,
           0.59773568,    5.84550107,   45.40336346,    5.94324817,
          11.067738  ,  416.36933524,   16.71464075,  173.91406674,
          17.59325326],
       [  69.87957895,   78.53585837,   21.88458216,    0.78535858,
           0.76668427,    7.65978645,   79.15831848,    9.76610981,
          19.06324743,  598.52789787,   19.98237069,  311.77750713,
          30.19139507],
       [  69.40067405,   77.99758081,   21.73774005,    0.77997581,
           0.76406824,    7.63169374,   78.63565097,    9.70691596,
          18.93944359,  595.70732345,   19.9317726 ,  309.64280598,
          29.99632329]])

In [338]:
# We will also create some hard-coded outliers for 'medv', i.e. our target
medv_outliers = np.array([0, 0, 600, 700, 600])

In [339]:
# Let's have a look at the data types of all the columns
# so that our new samples stay compatible with the original dataset
data_indep.dtypes


Out[339]:
crim       float64
zn         float64
indus      float64
chas         int64
nox        float64
rm         float64
age        float64
dis        float64
rad          int64
tax          int64
ptratio    float64
black      float64
lstat      float64
dtype: object

In [340]:
# Round 'chas', 'rad' and 'tax' to integer values, matching the original data types
outliers_X[:, [3, 8, 9]] = np.int64(np.round(outliers_X[:, [3, 8, 9]]))

In [341]:
# Finally concatenate our existing 'train_X' and 'train_y' with these outliers
train_X = np.append(train_X, outliers_X, axis = 0)
train_y = np.append(train_y, medv_outliers, axis = 0)

In [342]:
# Plot a histogram of 'medv' in train_y
fig = plt.figure(figsize=(13,7))
plt.hist(train_y, bins=50, range = (-10, 800))
fig.suptitle('medv Count', fontsize = 20)
plt.xlabel('medv', fontsize = 16)
plt.ylabel('count', fontsize = 16)


Out[342]:
<matplotlib.text.Text at 0x7f906589cb10>

You can see that there are some clear outliers at 600 and 700, and even one or two 'medv' values of 0.
Now that our outliers are in place, we will once again fit a GradientBoostingRegressor with the L1 and L2 loss functions to see the contrast in their performance in the presence of outliers.


In [343]:
# So let's fit a GradientBoostingRegressor with an L1 (Least Absolute Deviation) loss function
np.random.seed(9876)

# A GradientBoostingRegressor with L1 (Least Absolute Deviation) as the loss function
mod = GradientBoostingRegressor(loss='lad')

fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error
print "RMSE -> %f" % rmse(predict, test_y)


RMSE -> 7.055568

In [344]:
error = predict - test_y

# Print the squared error for each test sample
np.around(error ** 2, decimals = 2)


Out[344]:
array([    0.04,     0.69,    17.13,     0.01,     0.62,     0.01,
           1.88,     0.17,     0.54,    16.12,     2.4 ,     0.23,
           0.04,     0.  ,     1.62,    83.27,     1.58,     0.23,
          17.81,     1.3 ,     2.77,    12.79,     0.07,     0.45,
           1.59,     7.27,     1.3 ,     2.45,     5.99,     7.88,
           9.02,     0.49,  3396.2 ,     0.01,    28.32,    12.75,
           0.45,     0.  ,    15.67,     0.  ,     4.2 ,     3.72,
          30.43,     5.71,     0.01,     0.18,     7.47,     0.09,
           1.28,    12.06,     2.49,     0.13,     6.22,     2.76,
           2.29,     0.02,     4.91,    16.74,     1.94,     0.27,
           0.25,   363.92,     2.42,     0.63,     0.58,     4.43,
           4.28,     3.15,     1.17,     0.97,     0.73,     1.43,
           0.53,     0.49,     0.81,    16.5 ,     0.07,    14.49,
           1.84,    38.94,     0.17,    16.95,     0.45,     1.42,
           1.97,     6.45,     1.08,     3.42,     0.36,     0.46,
           1.1 ,     3.07,     0.12,     2.18,    46.74,     1.48,
         652.32,    66.97,     9.32,    46.2 ,     3.05,     0.66])

In [345]:
# A GradientBoostingRegressor with L2 (Least Squares) as the loss function
mod = GradientBoostingRegressor(loss='ls')

fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error
print "RMSE -> %f" % rmse(predict, test_y)


RMSE -> 9.806251

In [346]:
error = predict - test_y

# Print the squared error for each test sample
np.around(error ** 2, decimals = 2)


Out[346]:
array([    0.  ,     6.18,    11.  ,     0.36,     0.45,     1.62,
           0.28,     0.02,     0.59,    22.96,     2.24,     0.06,
           0.98,     0.02,     0.64,     6.52,     1.72,     0.04,
           7.4 ,     3.76,     0.76,    15.28,     0.38,     1.95,
           3.7 ,     9.  ,     4.28,     6.69,     5.83,    10.83,
          17.79,     0.57,   121.92,     2.3 ,    15.77,     4.94,
           0.02,     0.6 ,    13.59,     0.03,    11.32,     1.51,
          16.45,     6.85,     0.07,     0.21,     6.32,     0.12,
           3.41,    12.18,     7.96,     2.12,     2.95,     9.69,
          17.15,     0.4 ,     2.43,     4.46,     1.58,     1.59,
           2.45,     0.66,     2.56,     3.33,     0.43,    13.59,
           6.76,     1.15,     2.87,     3.3 ,     4.7 ,     0.01,
           1.3 ,     0.02,     0.86,  8597.78,     0.07,     1.7 ,
           0.  ,    33.7 ,     0.71,    10.75,     0.03,     0.63,
           1.55,    13.66,     0.89,     2.61,     0.27,     0.05,
           1.12,     0.08,     0.  ,     3.76,    63.48,    10.78,
         475.58,   111.95,    12.11,     5.93,     1.53,     1.99])

With outliers in the dataset, the L2 loss function tries to adjust the model to these outliers at the expense of the other samples, since the squared error becomes huge for the outliers (whenever the error is greater than 1). The L1 (Least Absolute Deviation) loss, on the other hand, is quite robust to outliers.
As a result, the L2 loss function can produce large deviations on some of the test samples, which reduces overall accuracy.
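The mechanics are easiest to see in the pseudo-residuals (negative gradients) that each new boosting tree is fitted to: under the L2 loss they equal the residuals themselves, so a residual of several hundred dominates the fit, while under the L1 loss they are only the signs of the residuals, so an outlier pulls no harder than any other sample. A minimal sketch with hypothetical residual values (not taken from the models above):

import numpy as np

# Hypothetical residuals (y - prediction); the last one mimics an injected outlier
residuals = np.array([0.5, -1.2, 0.8, 2.0, 100.0])

# Pseudo-residuals the next boosting tree would be fitted to
l2_pseudo = residuals            # L2: proportional to the residual, so the outlier dominates
l1_pseudo = np.sign(residuals)   # L1: only the sign, bounded at +/-1

print("L2 pseudo-residuals: %s" % l2_pseudo)
print("L1 pseudo-residuals: %s" % l1_pseudo)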